# Rural livelihoods and the footprint of the coal industry in Jharkhand, India. 

# Script author: [ANON]

# Paper authors: [ANON]

# LOAD NECESSARY PACKAGES ------------------------------------------------------

#install.packages(pacman)
library(pacman)

# tidyverse
p_load(dplyr, tidyr, stringr, ggplot2, forcats, readxl)

# spatial (note you must use the same version of Java as R for OSM (32 vs 64))
p_load(sf, mapview, tmap, tmaptools, OpenStreetMap, osmdata)

# misc
p_load(here, patchwork, naniar, lubridate, stargazer, sjlabelled)

# custom
source(here("R", "functions.R"))

# Avoid scientific notation
options(scipen = 999)

# # Spatial data preparation (not required to be run) --------------------------
# 
# maindata <- read_xlsx(
#   here("Data", "Survey", "coalcommunities_jk_data.xlsx"),
#   sheet = 1, skip = 1)
# 
# crs <- "EPSG:7765"
# 
# state_sf <- st_read(here("Data","Spatial", "gadm36_IND_1.shp")) %>% 
#   select(state = NAME_1) %>% 
#   filter(state == "Jharkhand") %>% 
#   st_sf() %>% 
#   st_transform(crs = crs)
# 
# # Data source: Indian 2011 Census
# vill_sf <- st_read(here("Data","Spatial", "villages_jk.shp")) %>% 
#   st_transform(crs = crs)
# 
# # Data source: https://doi.org/10.1088/2633-1357/abdbbb
# mines_sf <- read_xlsx(here("Data", "Spatial", 
#                            "Indian Coal Mines Dataset_January 2021-1.xlsx"), 
#                       sheet = 2) %>% 
#   select(-Source) %>% 
#   st_as_sf(coords = c("Longitude", "Latitude"), crs = "EPSG:4326") %>% 
#   st_transform(crs = crs) %>% 
#   st_join(state_sf, left = FALSE)
# 
# # Data source: https://www.devdatalab.org/shrug
# vill_pc <- read.csv(here("Data", "Statistical",
#                          "shrug-v1.5.samosa-pop-econ-census-csv",
#                          "shrug-v1.5.samosa-pop-econ-census-csv",
#                          "shrug_pc11.csv")) %>% 
#   transmute(shrid = shrid,
#             CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
#             hhs_11 = pc11_pca_no_hh,
#             pop_11 = pc11_pca_tot_p,
#             pop_rural_11 = pc11_pca_tot_p_r,
#             pop_urban_11 = pc11_pca_tot_p_u,
#             pop_sc_11 = pc11_pca_p_sc,
#             pop_st_11 = pc11_pca_p_st,
#             villarea_11 = pc11_vd_area)
# 
# vill_ec <- read.csv(here("Data", "Statistical",
#                          "shrug-v1.5.samosa-pop-econ-census-csv",
#                          "shrug-v1.5.samosa-pop-econ-census-csv",
#                          "shrug_ec.csv")) %>% 
#   transmute(shrid = shrid,
#             CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
#             emp_all_13 = ec13_emp_all,
#             emp_manuf_13 = ec13_emp_manuf,
#             emp_serv_13 = ec13_emp_services)
# 
# vill_keys <- read.csv(here("Data", "Statistical",
#                            "shrug-v1.5.samosa-pop-econ-census-csv",
#                            "shrug-v1.5.samosa-keys-csv",
#                            "shrug_pc11_subdistrict_key.csv")) %>% 
#   transmute(shrid = shrid,
#             CENSUS2011 = as.numeric(str_extract(shrid, pattern = "\\d{6}")),
#             district = pc11_district_name,
#             subdistrict = pc11_subdistrict_name
#   )
# 
# vill_pcec <- left_join(vill_pc, vill_ec) %>% 
#   left_join(vill_keys) %>% 
#   filter(CENSUS2011 %in% vill_sf$CENSUS2011)
# 
# vill_sf <- left_join(vill_sf, vill_pcec)
# 
# # Filter out urban revenue villages (CENSUS2011 ID starts with 8)
# vill_sf <- vill_sf %>% 
#   filter(!str_starts(CENSUS2011, "8"))
# 
# # Extract key cities
# cities_jk <- opq(sf::st_bbox(st_transform(state_sf, crs = "EPSG:4326"))) %>% 
#   add_osm_feature(key = "place", value = "city") %>% 
#   osmdata_sf() 
# 
# cities_jk <- cities_jk$osm_points %>% 
#   filter(!is.na(name)) %>% 
#   select(name, geometry) %>% 
#   st_transform(crs = crs) %>% 
#   st_join(state_sf) %>% 
#   filter(!is.na(state))
# 
# # Calculate total coal production within different distances of villages
# vill_mine_5k <- st_is_within_distance(vill_sf, mines_sf, dist = 5000)
# vill_mine_10k <- st_is_within_distance(vill_sf, mines_sf, dist = 10000)
# vill_mine_20k <- st_is_within_distance(vill_sf, mines_sf, dist = 20000)
# 
# vill_coalprod_df <- tibble(
#   CENSUS2011 = vill_sf$CENSUS2011,
#   coalprod5k = sapply(vill_mine_5k,
#                       function(x)
#                         sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x])),
#   coalprod10k = sapply(vill_mine_10k,
#                        function(x)
#                          sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x])),
#   coalprod20k = sapply(vill_mine_20k,
#                        function(x)
#                          sum(mines_sf$`Coal/ Lignite Production (MT) (2019-2020)`[x])))
# 
# vill_sf <- vill_sf %>% 
#   left_join(vill_coalprod_df)
# 
# # Prepare sample frame, removing revenue villages with rural population below 100
# sampleframe <- vill_sf %>% 
#   tibble() %>% 
#   mutate(samplestrata = 
#            case_when(
#              coalprod5k > 5 ~ "5km_5MT",
#              coalprod5k > 0 ~ "5km_any",
#              coalprod10k > 5 ~ "10km_5MT",
#              coalprod10k > 0 ~ "10km_any",
#              coalprod20k > 5 ~ "20km_5MT",
#              coalprod20k > 0 ~ "20km_any",
#              TRUE ~ "NOT SAMPLED"
#            ),
#          samplestrata = factor(samplestrata, levels = c("5km_5MT", "5km_any",
#                                                         "10km_5MT", "10km_any",
#                                                         "20km_5MT", "20km_any")),
#          coalprod20k = coalprod20k - coalprod10k,
#          coalprod10k = coalprod10k - coalprod5k
#   ) %>% 
#   separate(samplestrata, sep = "_", into = c("distancestrata", "samplegroup"), remove = F) %>% 
#   filter(!is.na(pop_rural_11) & pop_rural_11 >= 100) %>% 
#   mutate(distancestrata = ifelse(is.na(distancestrata), "NOT SAMPLED", distancestrata),
#          distancestrata = factor(distancestrata, levels = c("5km","10km","20km","NOT SAMPLED")))
# 
# sampleframe_sf <- st_sf(sampleframe)
# 
# tmap_sampleframe <- 
#   tm_shape(read_osm(state_sf)) +
#   tm_rgb() +
#   
#   tm_shape(state_sf) +
#   tm_borders() +
#   
#   tm_shape(sampleframe_sf) +
#   tm_symbols(size = "pop_11", title.size = "Census '11 village population",
#              col = "distancestrata", title.col = "Distance strata", 
#              palette = "viridis", border.alpha = 0) +
#   
#   tm_shape(mines_sf) +
#   tm_symbols(size = "Coal/ Lignite Production (MT) (2019-2020)", shape = 24,
#              title.size = "Coal production (MT) ('19-'20)",
#              col = "brown", border.alpha = 0) +
#   
#   tm_shape(cities_jk) +
#   tm_symbols(shape = 21, size = 0.1, col = "black", border.alpha = 0) +
#   tm_text(text = "name", size = 0.8, col = "black") +
#   
#   tm_scale_bar(breaks = c(0, 20, 40, 60), text.size = 1, position = c("left", "bottom")) +
#   tm_compass(type = "4star", size = 2, position = c("left", "top")) +
#   tm_layout(legend.position = c("right", "bottom"),   legend.outside = T, 
#             title = "Survey sample frame")
# 
# tmap_save(tmap_sampleframe, filename = here("Manuscript", "Figures", "sampleframe.png"),
#                                             width = 8, height = 5, units = "in")
# # Merge sample with sampleframe data
# sample_sf <- maindata %>% 
#   select(q_9) %>% 
#   left_join(sampleframe_sf %>% select(distancestrata, 
#                                       samplegroup,
#                                       Households = hhs_11, 
#                                       geometry, CENSUS2011), 
#             by = c("q_9" = "CENSUS2011")) %>% 
#   distinct(q_9, .keep_all = T) %>% 
#   mutate(distancestrata = factor(distancestrata, levels = c("5km","10km","20km"))) %>% 
#   st_sf()
# 
# # Calculate proportion of households in each distance strata that live in 
# # revenue villages with either >= | < 5MT coal production in 2019/20
# # This is multiplied by desired sample per distance strata (67) to define total
# # sample per sub-strata (>= | < 5MT coal production).
# # Include actually sampled villages
# sampleframe %>% 
#   tibble() %>% 
#   group_by(distancestrata, samplegroup) %>% 
#   summarise(villages = n(),
#             hhs_11 = sum(hhs_11)) %>% 
#   mutate(sharevill = round(villages / sum(villages),2),
#          sharehhs = round(hhs_11 / (sum(hhs_11)),2),
#          samplebyhhshare = round(67 * sharehhs)) %>% 
#   filter(distancestrata != "NOT SAMPLED") %>% 
#   left_join(tibble(sample_sf) %>% group_by(distancestrata, samplegroup) %>% 
#               summarise("Sample achieved" = n())) %>% 
#   ungroup() %>% 
#   transmute("Distance strata" = as.character(distancestrata), 
#             "Production sub-strata" = as.character(samplegroup),
#          Villages = villages, Households = hhs_11, "Village share" = sharevill,
#          "Household share" = sharehhs, "Sampled villages" = samplebyhhshare) %>% 
#   stargazer(summary = F, rownames = F, out = here("Manuscript", "Tables", "samplesize.tex"))
# 
# tmap_sample <- 
#   tm_shape(read_osm(state_sf)) +
#   tm_rgb() +
#   
#   tm_shape(state_sf) +
#   tm_borders() +
#   
#   tm_shape(sample_sf) +
#   tm_symbols(size = "Households", title.size = "Census '11 village households",
#              col = "distancestrata", title.col = "Distance strata",
#              shape = "samplegroup", title.shape = "Production sub-strata",
#              palette = viridis::viridis(n = 3, begin = 0.1, end = 0.8), 
#              border.alpha = 0, jitter = 0.1) +
#   
#   tm_shape(cities_jk) +
#   tm_text(text = "name", size = 0.8, ) +
#   tm_symbols(shape = 21, size = 0.1, col = "black", border.alpha = 0) +
#   tm_scale_bar(breaks = c(0, 20, 40, 60), text.size = 1, position = c("left", "bottom")) +
#   tm_compass(type = "4star", size = 2, position = c("left", "top")) +
#   tm_layout(legend.position = c("right", "bottom"),   legend.outside = T, 
#             title = "Sampled villages")
# 
# tmap_save(tmap_sample, filename = here("Manuscript", "Figures", "sample_jitter.png"),
#           width = 8, height = 5, units = "in")
# 
# # Add in strata to household survey data
# maindata <- maindata %>% 
#   left_join(sampleframe_sf %>% select(distancestrata, 
#                                       samplegroup,
#                                       Households = hhs_11, 
#                                       geometry, CENSUS2011), 
#             by = c("q_9" = "CENSUS2011"))
# 
# # Save maindata to file for quicker reload
# # saveRDS(maindata, here("Data", "Survey", "coalcommunities_jk_data.rds"))

# READ IN DATA -----------------------------------------------------------------

maindata <- readRDS(here("Data", "Survey", "coalcommunities_jk_data.rds"))

codebook <- read_xlsx(
  here("Data", "Survey", "coalcommunities_jk_codebook.xlsx"))

choices <- read_xlsx(
  here("Data", "Survey", "coalcommunities_jk_choices.xlsx"))

# ANALYSIS ---------------------------------------------------------------------

# Descriptive analysis ---------------------------------------------------------

# Socio-economic characteristics
maindata %>% 
  transmute("Female" = as.numeric(q_18 == 0),
            "Age" = q_17,
            "Literate (Native Language)" = q_20,
            "Literate (Hindi)" = q_20_2,
            "Education 10th+" = as.numeric(q_21 %in% c(3,4,5)),
            "SC/ST" = as.numeric(q_23 %in% c(1,2)),
            "BPL" = as.numeric(q_26 %in% c(3,4)),
            "Adults in household" = q_24,
            "One member of trade union" = q_46,
            "Working toilet" = q_27,
            "Piped water" = q_28,
            "Grid connection" = q_29,
            "LPG connection" = q_31,
            "Owns land" = q_33,
            "Rooms in home" = q_34,
            "Monthly expenditures" = q_37) %>% 
  pivot_longer(everything()) %>% 
  mutate(value = ifelse(value < 0, NA, value),
         name = factor(name, levels = c("Female", "Age", "Literate (Native Language)",
                                        "Literate (Hindi)", "Education 10th+",
                                        "SC/ST", "BPL", "Adults in household",
                                        "One member of trade union",
                                        "Working toilet", "Piped water", "Grid connection",
                                        "LPG connection", "Owns land", "Rooms in home",
                                        "Monthly expenditures"))) %>% 
  group_by(name) %>% 
  summarise(Mean = round(mean(value, na.rm = T),2),
            SD = round(sd(value, na.rm = T),2),
            Min = min(value, na.rm = T),
            Max = max(value, na.rm = T),
            Median = median(value, na.rm = T),
            Missing = sum(is.na(value))) %>% 
  rename(Variable = name) %>%
  mutate(Variable = as.character(Variable)) %>% 
  stargazer(summary = F, rownames = F, 
            out = here("Manuscript", "Tables", "se_summarystats.tex"),
            float = F)

# Socio-economic characteristics - by distance strata
maindata %>% 
  transmute("Female" = as.numeric(q_18 == 0),
            "Age" = q_17,
            "Literate (Native Language)" = q_20,
            "Literate (Hindi)" = q_20_2,
            "Education 10th+" = as.numeric(q_21 %in% c(3,4,5)),
            "SC/ST" = as.numeric(q_23 %in% c(1,2)),
            "BPL" = as.numeric(q_26 %in% c(3,4)),
            "Adults in household" = q_24,
            "One member of trade union" = q_46,
            "Working toilet" = q_27,
            "Piped water" = q_28,
            "Grid connection" = q_29,
            "LPG connection" = q_31,
            "Owns land" = q_33,
            "Rooms in home" = q_34,
            "Monthly expenditures" = q_37,
            "Distance strata" = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                    labels = c("0km-5km", "5km-10km", "10km-20km"))) %>% 
  pivot_longer(-`Distance strata`) %>% 
  mutate(value = ifelse(value < 0, NA, value),
         name = factor(name, levels = c("Female", "Age", "Literate (Native Language)",
                                        "Literate (Hindi)", "Education 10th+",
                                        "SC/ST", "BPL", "Adults in household",
                                        "One member of trade union",
                                        "Working toilet", "Piped water", "Grid connection",
                                        "LPG connection", "Owns land", "Rooms in home",
                                        "Monthly expenditures"))) %>% 
  group_by(`Distance strata`, name) %>% 
  summarise(Mean = round(mean(value, na.rm = T),2),
            SD = round(sd(value, na.rm = T),2),
            Min = min(value, na.rm = T),
            Max = max(value, na.rm = T),
            Median = median(value, na.rm = T),
            Missing = sum(is.na(value))) %>% 
  rename(Variable = name) %>%
  mutate(Variable = as.character(Variable),
         `Distance strata` = as.character(`Distance strata`)) %>% 
  stargazer(summary = F, rownames = F, 
            out = here("Manuscript", "Tables", "se_summarystats_strata.tex"),
            float = F)

# Job type for top earner
a <- maindata %>% 
  select("q_38_2_1", KEY, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  pivot_longer(-c(KEY, distancestrata), 
               names_pattern = "(.+)_(\\d+)", 
               names_to = c(".value", "set")) %>% 
  mutate(q_38_2 = labeler(., "q_38_2", "choices_q_38_2", choices)
  ) %>%
  group_by(distancestrata) %>% 
  mutate(n = n()) %>% 
  dplyr::count(distancestrata, n, JobType = q_38_2) %>% 
  mutate(prop = round(nn/n,2),
         se = sqrt((prop*(1-prop))/n)) %>%
  select(-c(nn,n)) %>% 
  mutate(JobType = fct_reorder(JobType, prop)) %>% 
  ggplot(aes(JobType, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3)) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3) +
  coord_flip() +
  theme_bw() +
  scale_colour_brewer(type = "qual", direction = -1) +
  labs(x = NULL,
       y = "Proportion of all households",
       subtitle = "Main income earner job type ...",
       colour = "Distance to mine") +
  theme(legend.position = "bottom")
  
# Job sector for top earner
b <- maindata %>% 
  select("q_38_3_1", KEY, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  pivot_longer(-c(KEY, distancestrata), 
               names_pattern = "(.+)_(\\d+)", 
               names_to = c(".value", "set")) %>% 
  # Note: some households indicated two sectors. For the purposes of this analysis
  # we consider their first response the primary sector.
  mutate(q_38_3 = ifelse(str_length(q_38_3) > 2, strtrim(q_38_3, 1), q_38_3)) %>% 
  mutate(q_38_3 = labeler(., "q_38_3", "choices_q_38_3", choices),
         q_38_3 = str_remove(q_38_3, pattern = " Industry")
         ) %>%
  group_by(distancestrata) %>% 
  mutate(n = n()) %>% 
  dplyr::count(distancestrata, n, JobSector = q_38_3) %>% 
  mutate(prop = round(nn/n,2),
         se = sqrt((prop*(1-prop))/n)) %>%
  select(-c(nn,n)) %>% 
  mutate(JobSector = fct_reorder(JobSector, prop)) %>% 
  ggplot(aes(JobSector, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3)) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3) +
  coord_flip() +
  theme_bw() +
  scale_colour_brewer(type = "qual", direction = -1) +
  labs(x = NULL,
       y = "Proportion of all households",
       subtitle = "Main income earner job sector ...",
       colour = "Distance to mine") +
  theme(legend.position = "bottom")

# c <- maindata %>% 
#   select("q_38_3_1", q_39, KEY, distancestrata) %>% 
#   mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
#                                  labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
#   pivot_longer(-c(q_39, KEY, distancestrata), 
#                names_pattern = "(.+)_(\\d+)", 
#                names_to = c(".value", "set")) %>% 
#   separate_rows(q_38_3, sep = " ") %>% 
#   mutate(q_38_3 = labeler(., "q_38_3", "choices_q_38_3", choices)) %>%
#   group_by(distancestrata) %>% 
#   mutate(n = n()) %>% 
#   dplyr::count(distancestrata, n, Migrated = q_39) %>% 
#   mutate(prop = round(nn/n,2),
#          se = sqrt((prop*(1-prop))/n),
#          distancestrata = paste0(distancestrata, " to mine")) %>%
#   select(-c(nn,n)) %>% 
#   filter(Migrated == 1) %>% 
#   ggplot(aes(distancestrata, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
#   geom_point(position = position_dodge2(width = 0.3), show.legend = F) +
#   geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3,
#                 show.legend = F) +
#   coord_flip(ylim = c(0, 0.5)) +
#   theme_bw() +
#   scale_colour_brewer(type = "qual", direction = -1) +
#   labs(x = NULL,
#        y = "Proportion of all households",
#        subtitle = "Proportion that migrated to their current location for work.",
#        colour = "Distance to mine") +
#   theme(legend.position = "bottom") +
#   guides(colour = "none")

jobsectors <- wrap_plots(a,b, heights = c(0.7,1)) + plot_layout(guides = "collect") + 
  plot_annotation(tag_levels = "A") &
  theme(legend.position = "bottom") &
  guides(colour=guide_legend(title.position="left", 
                             title.hjust =0.5))

ragg::agg_png(filename = here("Manuscript", "Figures", "jobsectors.png"),
              height = 12, width = 12, units = "in", res = 300, scaling = 2)
jobsectors
dev.off()

# Non-coal industry loss of income following closures
c <- maindata %>% 
  select(distancestrata, q_38_3_1, q_38_5_1) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  mutate(nhouseholds = sum(!grepl(q_38_3_1, pattern = 2)),
         q_38_3_1 = ifelse(str_length(q_38_3_1) > 2, strtrim(q_38_3_1, 1), q_38_3_1)) %>% 
  transmute(
    distancestrata = distancestrata,
    nhouseholds = nhouseholds,
    q_38_3_1 = labeler(., "q_38_3_1", "choices_q_38_3", choices),
    q_38_5_1 = labeler(., "q_38_5_1", "choices_q_38_5", choices)) %>% 
  mutate(q_38_5_1 = ifelse(q_38_5_1 %in% c("Somewhat", "Very much"), "At least somewhat", "Not at all")) %>% 
  dplyr::count(nhouseholds, distancestrata, q_38_5_1) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  mutate(q_38_5_1 = factor(q_38_5_1, levels = c("Not at all",
                                                "At least somewhat"))) %>% 
  filter(sum(n) >= 20, q_38_5_1 == "At least somewhat") %>% 
  ggplot(aes(distancestrata, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3), size = 3, show.legend = F) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3, show.legend = F) +
  coord_flip(ylim = c(0,.5)) +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  labs(x = NULL,
       y = "Proportion of households (not working in coal sector)",
       title = "Mine closure would [somewhat | very much] affect main earner income.")

d <- maindata %>% 
  select(distancestrata, q_38_3_1, q_41) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]")),
         nhouseholds = sum(!grepl(q_38_3_1, pattern = 2)),
         q_38_3_1 = ifelse(str_length(q_38_3_1) > 2, strtrim(q_38_3_1, 1), q_38_3_1)
         ) %>% 
  transmute(
    distancestrata = distancestrata,
    nhouseholds = nhouseholds,
    q_38_3_1 = labeler(., "q_38_3_1", "choices_q_38_3", choices),
    q_41 = labeler(., "q_41", "choices_q_41", choices)) %>% 
  mutate(q_41 = ifelse(q_41 %in% c("Somewhat important", "Very important"), "At least somewhat", "Not at all")) %>% 
  dplyr::count(nhouseholds, distancestrata, q_41) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  mutate(q_41 = factor(q_41, levels = c("Not at all","At least somewhat"))) %>% 
  filter(sum(n) >= 20, q_41 == "At least somewhat") %>% 
  ggplot(aes(distancestrata, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3), size = 3, show.legend = F) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3, show.legend = F) +
  coord_flip(ylim = c(0,.5)) +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  labs(x = NULL,
       y = "Proportion of households (not working in coal sector)",
       title = "Coal industry is [somewhat | very] important for household livelihoods.")

e <- maindata %>% 
  select(KEY, distancestrata, q_43_1:q_43_7) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  pivot_longer(-c(distancestrata,KEY)) %>% 
  mutate(value = labeler(., "value", "choices_q_43", choices),
         value = ifelse(value %in% c("Somewhat important", "Very important"), "At least somewhat", "Not at all")) %>% 
  dplyr::count(distancestrata, name, value) %>% 
  group_by(distancestrata, name) %>% 
  mutate(prop = n/sum(n),
         se = sqrt((prop*(1-prop))/sum(n)),
         name = case_when(
           name == "q_43_1" ~ "Agriculture",
           name == "q_43_2" ~ "Trades/Manufacturing",
           name == "q_43_3" ~ "Retail",
           name == "q_43_4" ~ "Education",
           name == "q_43_5" ~ "Health",
           name == "q_43_6" ~ "Administration (Government)",
           name == "q_43_7" ~ "Coal"),
         name = factor(name, levels = levels(b$data$JobSector))) %>% 
  filter(value == "At least somewhat") %>% 
  ggplot(aes(name, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3), size = 3) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3) +
  coord_flip(ylim = c(0,1)) +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  labs(x = NULL,
       y = "Proportion of all households",
       title = " ... sector is [somewhat | very] important for village livelihoods.",
       colour = "Distance to coal mine") 

dc <- wrap_plots(d/c) +
  plot_layout(guides = "collect") & theme(legend.position = "bottom")

industryimportance_all <- wrap_plots(e, dc) +
  plot_layout(guides = "collect", widths = c(1,1)) + 
  plot_annotation(tag_levels = "A") & theme(legend.position = "bottom") &
  theme(title = element_text(size = 7), 
        axis.title = element_text(size = 8), legend.title = element_text(size = 8))

ragg::agg_png(filename = here("Manuscript", "Figures", "industryimportance_all.png"),
              height = 7, width = 15, units = "in", res = 300, scaling = 1.4)
industryimportance_all
dev.off()

# Trust and perception
f_1_data <- maindata %>% 
  select(q_47, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  transmute(
    distancestrata = distancestrata,
    q_47 = labeler(., "q_47", "choices_q_47", choices)) %>% 
  mutate(q_47 = ifelse(q_47 %in% c("Somewhat", "Very much"), "At least somewhat", "Not at all")) %>% 
  dplyr::count(distancestrata, q_47) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  mutate(q_47 = factor(q_47, levels = c("Not at all","At least somewhat"))) %>% 
  filter(q_47 == "At least somewhat")

f_2_data <- maindata %>% 
  select(q_48, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  transmute(
    distancestrata = distancestrata,
    q_48 = labeler(., "q_48", "choices_q_47", choices)) %>% 
  mutate(q_48 = ifelse(q_48 %in% c("Somewhat", "Very much"), "At least somewhat", "Not at all")) %>% 
  dplyr::count(distancestrata, q_48) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  mutate(q_48 = factor(q_48, levels = c("Not at all","At least somewhat"))) %>% 
  filter(q_48 == "At least somewhat")

f <- rbind(f_1_data %>% mutate(q = "State Govt."), 
      f_2_data %>% mutate(q = "Coal Company")) %>% 
  ggplot(aes(x = q, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3, preserve = "single"), size = 3, show.legend = F) +
  geom_errorbar(position = position_dodge2(width = 0.3, preserve = "single"), width = 0.3, show.legend = F) +
  coord_flip(ylim = c(0,1)) +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  labs(x = NULL,
       y = "Proportion of all households",
       title = "I trust .... [somewhat | very much] to improve livelihoods in our area.",
       colour = "Distance to coal mine")

g <- maindata %>% 
  select(KEY, distancestrata, q_49_1:q_49_4) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  pivot_longer(-c(distancestrata,KEY)) %>% 
  mutate(value = labeler(., "value", "choices_yn", choices)) %>% 
  dplyr::count(distancestrata, name, value) %>% 
  group_by(distancestrata, name) %>% 
  mutate(prop = n/sum(n),
         se = sqrt((prop*(1-prop))/sum(n)),
         name = case_when(
           name == "q_49_1" ~ "Clinics / hospitals",
           name == "q_49_2" ~ "Schools",
           name == "q_49_3" ~ "Electricity supply",
           name == "q_49_4" ~ "Water supply")) %>% 
  filter(value == "Yes") %>% 
  ggplot(aes(name, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3), size = 3) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3) +
  coord_flip(ylim = c(0,1)) +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  labs(x = NULL,
       y = "Proportion of all households",
       title = "The coal company has improved .... in our area.",
       colour = "Distance to coal mine") 

trustandperceptions_all <- wrap_plots(g, f, ncol = 2) +
  plot_layout(guides = "collect") + 
  plot_annotation(tag_levels = "A") & theme(legend.position = "bottom") &
  theme(title = element_text(size = 7), 
        axis.title = element_text(size = 8), legend.title = element_text(size = 8))

ragg::agg_png(filename = here("Manuscript", "Figures", "trustandperceptions_all.png"),
              height = 7, width = 15, units = "in", res = 300, scaling = 1.4)
trustandperceptions_all
dev.off()

# Alternative livelihood investments

h_1_data <- maindata %>% 
  select(q_56, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]")),
         q_56 = ifelse(q_56 %in% c(4,5), 4, 2)) %>% 
  transmute(
    distancestrata = distancestrata,
    q_56 = labeler(., "q_56", "choices_q_56", choices)) %>% 
  dplyr::count(distancestrata, q_56) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  filter(q_56 == "Helpful") %>% 
  mutate(q = "Unskilled labour job guarantee") %>% 
  select(-q_56)

h_2_data <- maindata %>% 
  select(q_57, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]")),
         q_57 = ifelse(q_57 %in% c(4,5), 4, 2)) %>% 
  transmute(
    distancestrata = distancestrata,
    q_57 = labeler(., "q_57", "choices_q_56", choices)) %>% 
  dplyr::count(distancestrata, q_57) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  filter(q_57 == "Helpful") %>% 
  mutate(q = "Free job or trade training") %>% 
  select(-q_57)

h_3_data <- maindata %>% 
  select(q_58, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  mutate(q_58 = ifelse(q_58 %in% c("4", "5"), "4", "2")) %>% 
  transmute(
    distancestrata = distancestrata,
    q_58 = labeler(., "q_58", "choices_q_58", choices)) %>% 
  dplyr::count(distancestrata, q_58) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  mutate(q_58 = factor(q_58, levels = c("Not important","Important"))) %>% 
  filter(q_58 == "Important") %>% 
  mutate(q = "Govt. investment in non-coal jobs") %>% 
  select(-q_58)

h <- list(h_1_data, h_2_data) %>% 
  purrr::reduce(rbind) %>% 
  ggplot(aes(q, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.1), size = 3) +
  geom_errorbar(position = position_dodge2(width = 0.1), width = 0.1) +
  scale_y_continuous(breaks = scales::pretty_breaks(), limits = c(0,1)) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  coord_flip() +
  labs(x = NULL,
       y = "Proportion of all households",
       title = "... would be [helpful | very helpful] in case of mine closers.",
       colour = "Distance to coal mine") 

i <- maindata %>% 
  select(q_58_1, distancestrata) %>% 
  mutate(distancestrata = factor(distancestrata, levels = c("5km", "10km", "20km"),
                                 labels = c("[0km-5km]", "(5km-10km]", "(10km-20km]"))) %>% 
  transmute(
    distancestrata = distancestrata,
    q_58_1 = labeler(., "q_58_1", "choices_q_58_1", choices)) %>% 
  dplyr::count(distancestrata, q_58_1) %>% 
  group_by(distancestrata) %>% 
  mutate(prop = n / sum(n),
         se = sqrt((prop*(1-prop))/sum(n))) %>% 
  ggplot(aes(q_58_1, prop, ymin = prop-1.96*se, ymax = prop+1.96*se, colour = distancestrata)) +
  geom_point(position = position_dodge2(width = 0.3), size = 3) +
  geom_errorbar(position = position_dodge2(width = 0.3), width = 0.3) +
  scale_y_continuous(breaks = scales::pretty_breaks(), limits = c(0,1)) +
  scale_colour_brewer(type = "qual", direction = -1) +
  theme_bw() +
  coord_flip() +
  labs(x = NULL,
       y = "Proportion of households (of those stating important)",
       title = "The State Govt. should invest in new jobs ...",
       colour = "Distance to coal mine") 

altlivelihoods <- wrap_plots(h, i, ncol = 2) +
  plot_layout(guides = "collect") + 
  plot_annotation(tag_levels = "A") & theme(legend.position = "bottom") &
  theme(title = element_text(size = 7), axis.title = element_text(size = 8), 
        legend.title = element_text(size = 8))

ragg::agg_png(filename = here("Manuscript", "Figures", "altlivelihoods.png"),
              height = 7, width = 15, units = "in", res = 300, scaling = 1.4)
altlivelihoods
dev.off()


  
